#delimit cr
*compare similarity of programs in Arancibia et al. dataset
*read the coded post-interview sheet: row 3 of the range supplies the variable names (firstrow)
import excel using "${dta_dir}\Teacher training data_CLEAN_2017-01-07.xlsx", ///
	sheet("Coded only - post interviews") cellrange(A3:BG29) firstrow clear

*columns in which a "-" entry is blanked out so that the column can be converted to numeric
local varstofix design_diagnostic target_geography target_subject target_grade target_experience ///
	target_skills target_contract cost implications posevaluation negevaluation textbooks storybooks computers manuals ///
	lessonplans scripted crafts readers software nteachers nschools years dropout pfocus sfocus subjectfocus lectures ///
	discussion lessons materials diagnostics lessonplanning scriptedlessons cascade trainerprofile consecutive ///
	hourstotaltraining hourslectures hourspstudents hourspteachers nweeks location nteachers_session nvisits ///
	naturevisits distance monthsdistance d
	
foreach var of local varstofix{

	*a column that is absent from the sheet should still stop the script
	confirm variable `var'

	*import excel stores a column with no text entries as numeric already; comparing such a
	*column with "-" would abort with a type mismatch, so only string columns are converted
	capture confirm string variable `var'
	if _rc == 0 {
		replace `var' = "" if `var' =="-"
		destring `var', replace
	}

}


*number the studies by their current row position and put that id in the first column
gen id = _n
order id, first

*move d so that it sits directly after answered_int
order d, after(answered_int)


*store the cleaned data, then reload it as the working dataset
save "${dir}\Teacher training data_CLEAN_2017-01-07.dta", replace
use "${dir}\Teacher training data_CLEAN_2017-01-07.dta", clear

*the pairwise loop below relies on row order following id
sort id

*compare all studies to one another
	*one observation per study, so the number of rows is the number of studies
	local n_studies = _N
	
	*indicators that are checked for equality between the two studies of a pair
	local varstocompare designer implementor design_diagnostic objectives ///
		target_geography target_subject target_grade target_experience target_skills target_contract ///
		cost implications posevaluation negevaluation ///
		textbooks storybooks computers manuals lessonplans scripted crafts readers software ///
		years dropout pfocus sfocus subjectfocus ///
		lectures discussion lessons materials diagnostics lessonplanning scriptedlessons ///
		cascade trainerprofile consecutive ///
		hourstotaltraining hourslectures hourspstudents hourspteachers nweeks location ///
		nvisits naturevisits distance monthsdistance ///
		subject africa interview

	*start the results file as an empty dataset (no variables, no observations);
	*the study data in memory is protected by preserve/restore
	preserve
		drop _all
		save "${dir}\program_comparisons.dta", replace emptyok
	restore

	*loop over all studies (first member of the pair)
	forvalues i = 1/`n_studies'{

		*second member runs over j >= i: every unordered pair is visited once (no double-counting).
		*j == i is included on purpose, because the self-comparison rows are used afterwards
		*as a sanity check (a study must be identical to itself)
		forvalues j = `i'/`n_studies'{
		
			di "i = `i', j = `j'"
			
			quietly{
				preserve
					*pull out just these two studies
					keep if inlist(id,`i',`j')
					
					*a self-comparison leaves a single row, so the second-row subscript used below
					*would be out of range (missing) and no indicator would ever be counted;
					*duplicate the row so that the study is really compared with itself
					if `i'==`j' {
						expand 2
					}

					*total_count = indicators non-missing for both studies
					*total_same  = those indicators on which the two studies agree
					gen total_count = 0
					gen total_same = 0
					
					*compare all the vars across obs (row 1 vs row 2)
					foreach var of local varstocompare{
					
						replace total_count = total_count+1 if (!mi(`var'[1]) & !mi(`var'[2]))
						replace total_same = total_same+1 if (`var'[1]==`var'[2]) & (!mi(`var'[1]) & !mi(`var'[2]))
						
					}
					
					gen abs_difference_in_d = abs(d[1]-d[2])
					
					*both rows now carry identical results; collapse them into one record for the pair
					keep total_count total_same abs_difference_in_d
					duplicates drop
					gen id1 = `i'
					gen id2 = `j'

					*add this pair to the results accumulated so far and write them back
					append using "${dir}\program_comparisons.dta"
					save "${dir}\program_comparisons.dta", replace
					
				restore
			}
		}
		*end loop over j
	}
	*end loop over i

	
*load the accumulated pairwise results
use "${dir}\program_comparisons.dta", clear

*sanity check: for a study paired with itself, every counted indicator must also be counted as the same
assert total_same==total_count if id1==id2

*the self-comparison rows are only needed for the check above
keep if id1!=id2

*report missing values in the remaining results
mdesc


*number of compared indicators on which the pair differs, and that number as a share of those compared
gen total_different = total_count - total_same
gen percent_different = total_different/total_count

order id1 id2 percent_different total_different total_same total_count
sort percent_different

save "${dir}\program_comparisons.dta", replace


#delimit;

/* box plot of percent_different across all study pairs, y axis fixed to 0-1, all-white background */
graph box percent_different,
		ytitle("Share of" "Indicators with" "Differences", orientation(horizontal))
		ylabel(0(0.2)1, angle(0))
		yscale(range(0 1))
		yline(0, lcolor(black))
		plotregion(color(white) fcolor(white) icolor(white) ifcolor(white) lcolor(white) ilcolor(white))
		graphregion(color(white) fcolor(white) icolor(white) ifcolor(white) lcolor(white) ilcolor(white))
		/*xsize(7) ysize(5)*/
		;
